This notebook defines the most focal recurrent copy number units by removing focal changes that are within entire chromosome arm losses and gains. Most focal here meaning:
This notebook is intended to be run from the command line with the following (assumes you are in the root directory of the repository):
Rscript -e "rmarkdown::render('analyses/focal-cn-file-preparation/05-define-most-focal-cn-units.Rmd', clean = TRUE)"
# The percentage of calls a particular status needs to be
# above to be called the majority status -- the decision
# for a cutoff of 90% here was made to ensure that the status
# is not only the majority status but it is also significantly
# called more than the other status values in the region
percent_threshold <- 0.9
# The percentage threshold for determining if enough of a region
# (arm, cytoband, or gene) is callable to determine its status --
# the decision for a cutoff of 50% here was made as it seems reasonable
# to expect a region to be more than 50% callable for a dominant status
# call to be made
uncallable_threshold <- 0.5
library(tidyverse)
[30m── [1mAttaching packages[22m ────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mggplot2[30m 3.2.0 [32m✔[30m [34mpurrr [30m 0.3.2
[32m✔[30m [34mtibble [30m 2.1.3 [32m✔[30m [34mdplyr [30m 0.8.3
[32m✔[30m [34mtidyr [30m 0.8.3 [32m✔[30m [34mstringr[30m 1.4.0
[32m✔[30m [34mreadr [30m 1.3.1 [32m✔[30m [34mforcats[30m 0.4.0[39m
[30m── [1mConflicts[22m ───────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()[39m
status_majority_caller <- function(status_df,
region_variable,
status_column_name,
id_column_name,
percent_threshold_val,
uncallable_threshold_val = uncallable_threshold) {
# Given a data.frame with cytoband/gene-level copy number status data,
# find the dominant status of the cytoband/gene region by calculating
# the percentages of the region that each call represents.
#
# Args:
# status_df: data.frame with cytoband/gene-level copy number status data
# region_variable: string of the column name/region to calculate copy number
# status percentages for
# status_column_name: string of the column name that holds the relevant copy
# number status data
# id_column_name: string of the column name that holds the sample id values
# percent_threshold_val: What percent of calls a particular status needs to be
# above to be called the majority.
# uncallable_threshold_val: a threshold for determining if enough of a region is
# callable to determine its status.
#
# Return:
# status_count: data.frame with percentage values for each unique status in
# each unique region (arm/cytoband/gene) and the dominant
# status for that region
# Tidyeval for these columns
region_sym <- rlang::sym(region_variable)
status_sym <- rlang::sym(status_column_name)
id_sym <- rlang::sym(id_column_name)
# Format the data and group it
status_df <- status_df %>%
# Keep only the columns we need
select(!!id_sym, !!region_sym, !!status_sym) %>%
# Group by status, sample, and region
group_by(!!id_sym, !!region_sym, !!status_sym) %>%
# Get a total count of each s
summarize(status_n = n())
# Getting total counts by region
region_total_df <- status_df %>%
# Group by region and for each sample
group_by(!!region_sym, !!id_sym) %>%
# Get totals
summarize(total = sum(status_n))
# Calculate percent by using the region totals
status_percent_df <- status_df %>%
ungroup() %>%
# Tack on the by region totals
inner_join(region_total_df, by = c(region_variable, id_column_name)) %>%
# Bring back our region variable as its own column
mutate(percent = status_n / total) %>%
# Don't need these columns now
select(-status_n, -total) %>%
# Spread to individual columns
spread(status_column_name, percent) %>%
# Turn NAs into 0s
replace_na(list(
gain = 0,
loss = 0,
neutral = 0,
amplification = 0,
uncallable = 0,
unstable = 0
))
# The logic here is to define the region status based on the majority of calls
# in the region -- if the number of calls for a specific status is
# responsible for more than `percent_threshold` value of the total calls in that
# region, then it is used to define the region's status (exception is for the
# uncallable status where we define the region as uncallable when more than the
# `uncallable_threshold` of the calls in that region are uncallable)
if ((region_variable == "chromosome_arm") | (region_variable == "gene_symbol")) {
status_percent_df <- status_percent_df %>%
mutate(
dominant_status = case_when(
gain > percent_threshold_val ~ "gain",
loss > percent_threshold_val ~ "loss",
amplification > percent_threshold_val ~ "amplification",
TRUE ~ "neutral"
)
)
} else if (region_variable == "cytoband") {
status_percent_df <- status_percent_df %>%
mutate(
dominant_status = case_when(
uncallable > uncallable_threshold_val ~ "uncallable",
gain > percent_threshold_val ~ "gain",
loss > percent_threshold_val ~ "loss",
neutral > percent_threshold_val ~ "neutral",
TRUE ~ "unstable"
)
)
}
return(status_percent_df)
}
plot_dominant_status_calls <- function(status_count_df,
region_variable) {
# Given a data.frame with the percentage values for each region and the
# dominant status for that region, plot the dominant status call on the
# x-axis with the percentage values on the y-axis.
#
# Args:
# status_count_df: data.frame with percentage values for each unique status
# in each unique region (arm/cytoband/gene) and the
# dominant status for that region
# region_variable: string of the region (which will also be a column name)
# that the data.frame holds percentage values for
# Return:
# status_plot: plot representing the dominant status call on the x-axis and
# the percentage values on the y-axis
status_count_df %>%
# Remove the columns we don't want to plot
select(-region_variable, -biospecimen_id) %>%
dplyr::ungroup() %>%
# Gather the data.frame to have columns and values in the format of
# our status call, the percentage of total calls that status call is
# responisble for, and the dominant status call made based on the
# percentage value
tidyr::gather(status, percent,-dominant_status) %>%
# Plot our focal status values on the x-axis and the percentage values
# on the y-axis
ggplot2::ggplot(ggplot2::aes(x = status,
y = percent)) +
ggplot2::geom_jitter() +
# Facet wrap around each dominant status value
ggplot2::facet_wrap( ~ dominant_status)
}
results_dir <- "results"
# Define a logical object for running in CI
running_in_ci <- params$is_ci
Read in cytoband status file and format it for what we will need in this notebook.
# Read in the file with consensus CN status data and the UCSC cytoband data --
# generated in `03-add-cytoband-status-consensus.Rmd`
consensus_seg_cytoband_status_df <-
read_tsv(file.path("results", "consensus_seg_with_ucsc_cytoband_status.tsv.gz")) %>%
# Need this to not have `chr`
mutate(chr = gsub("chr", "", chr),
cytoband = paste0(chr, cytoband)) %>%
select(
chromosome_arm,
# Distinguish this dominant status that is based on cytobands, from the status
dominant_cytoband_status = dominant_status,
cytoband,
Kids_First_Biospecimen_ID
)
Parsed with column specification:
cols(
Kids_First_Biospecimen_ID = [31mcol_character()[39m,
chr = [31mcol_character()[39m,
cytoband = [31mcol_character()[39m,
dominant_status = [31mcol_character()[39m,
band_length = [32mcol_double()[39m,
callable_fraction = [32mcol_double()[39m,
gain_fraction = [32mcol_double()[39m,
loss_fraction = [32mcol_double()[39m,
chromosome_arm = [31mcol_character()[39m
)
Read in the chromosome-level and gene-level data.
# Read in the annotated CN file (without the UCSC data)
consensus_seg_autosomes_df <-
read_tsv(file.path(results_dir, "consensus_seg_annotated_cn_autosomes.tsv.gz"))
Parsed with column specification:
cols(
biospecimen_id = [31mcol_character()[39m,
status = [31mcol_character()[39m,
copy_number = [32mcol_double()[39m,
ploidy = [32mcol_double()[39m,
ensembl = [31mcol_character()[39m,
gene_symbol = [31mcol_character()[39m,
cytoband = [31mcol_character()[39m
)
Joining the gene-level, cytoband-level, and arm-level data into one data frame.
combined_status_df <- consensus_seg_autosomes_df %>%
inner_join(
consensus_seg_cytoband_status_df,
by = c("biospecimen_id" = "Kids_First_Biospecimen_ID", "cytoband")
)
# Use our custom function to make the status calls
arm_status_count <- status_majority_caller(
combined_status_df,
"chromosome_arm",
"status",
"biospecimen_id",
percent_threshold_val = percent_threshold
)
# Display table
arm_status_count %>%
group_by(dominant_status)
These are the chromosome arms that have not been defined as gain or loss – we want to define their cytoband/gene-level status
# Let's get a vector of the neutral arms
neutral_arms <- arm_status_count %>%
filter(dominant_status == "neutral") %>%
dplyr::pull("chromosome_arm")
We want to include cytoband and gene-level calls for chromosome arms that have not been defined as a gain or loss.
# Filter the annotated CN data to include only neutral chromosome arms
neutral_status_arm_df <- combined_status_df %>%
filter(chromosome_arm %in% neutral_arms)
Making cytoband-level majority calls.
if (!(running_in_ci)) {
# Now count the cytoband level calls (for each status call) and define
# the cytoband as that status if more than 50% of the total counts are
# for that particular status
cytoband_status_count <- status_majority_caller(
neutral_status_arm_df,
"cytoband",
"dominant_cytoband_status",
"biospecimen_id",
percent_threshold_val = percent_threshold
)
# Display table
cytoband_status_count %>%
group_by(dominant_status)
}
Plot the final dominant status call on the x-axis and the percent of each status on the y-axis.
rr # Plot if not running in circleCI if (!(running_in_ci)) { plot_dominant_status_calls(gene_status_count, _symbol) }
# Run `plot_dominant_status_calls` function for the cytoband calls if not
# running in CI
if (!(running_in_ci)) {
plot_dominant_status_calls(cytoband_status_count,
"cytoband")
}
# Plot if not running in circleCI
if (!(running_in_ci)) {
plot_dominant_status_calls(gene_status_count,
"gene_symbol")
}
if (!(running_in_ci)) {
final_gene_status_df <- combined_status_count_df %>%
# Filter to only neutral chromosome arms and cytobands
# that are NA or unstable (do not have a clear gain or loss call)
filter(dominant_status.arm == "neutral",
dominant_status.cytoband == c("NA", "unstable")) %>%
select(
Kids_First_Biospecimen_ID = biospecimen_id,
region = gene_symbol,
status = dominant_status
) %>%
distinct()
}
sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 9 (stretch)
Matrix products: default
BLAS/LAPACK: /usr/lib/libopenblasp-r0.2.19.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] forcats_0.4.0 stringr_1.4.0 dplyr_0.8.3 purrr_0.3.2 readr_1.3.1 tidyr_0.8.3 tibble_2.1.3
[8] ggplot2_3.2.0 tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] Rcpp_1.0.1 cellranger_1.1.0 pillar_1.4.2 compiler_3.6.0 tools_3.6.0 jsonlite_1.6
[7] lubridate_1.7.4 nlme_3.1-140 gtable_0.3.0 lattice_0.20-38 pkgconfig_2.0.2 rlang_0.4.0
[13] cli_1.1.0 rstudioapi_0.10 yaml_2.2.0 haven_2.1.1 xfun_0.8 withr_2.1.2
[19] xml2_1.2.0 httr_1.4.0 knitr_1.23 generics_0.0.2 hms_0.4.2 grid_3.6.0
[25] tidyselect_0.2.5 glue_1.3.1 R6_2.4.0 readxl_1.3.1 modelr_0.1.4 magrittr_1.5
[31] backports_1.1.4 scales_1.0.0 rvest_0.3.4 assertthat_0.2.1 colorspace_1.4-1 stringi_1.4.3
[37] lazyeval_0.2.2 munsell_0.5.0 broom_0.5.2 crayon_1.3.4